\documentclass[compress, brown, professionalfonts]{beamer} 
\usetheme{Singapore} % Beamer Theme 
\usecolortheme{lily} % Beamer Color Theme 
\usefonttheme{serif}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{epstopdf}
\usepackage{array}
\usepackage{epsfig,color}
\usepackage{movie15}
\newcommand{\muhat}{\hat{\mu}}


\begin{document}

\title{Reward Optimization in the Primate Brain: A POMDP Model of Decision Making Under Uncertainty}
\author{Yanping Huang}
\institute{University of Washington}
\date{\today}

\begin{frame}
\titlepage
\end{frame}

\section{Model Setup}

\begin{frame}
  \frametitle{Random Dots Motion Discrimination Tasks}
   \begin{center}
     \includegraphics[scale=0.5]{MonkeyRandomDots.png}
   \end{center}
\end{frame}


\begin{frame}
 \frametitle{Two Alternative Choices}
   \begin{center}
     \includegraphics[scale=0.6]{monkey2.png}
   \end{center}
\end{frame}

\begin{frame}
 \frametitle{Random Dots Motion Stimuli}
   \begin{center}
     \includegraphics[scale=0.6]{monkey3.png}
   \end{center}
\end{frame}

\begin{frame}
  \frametitle{Motion Strength with $c = 6.4\%$  }
   \begin{center}
     \begin{figure}[ht]
     \includemovie[poster,text={\includegraphics[width=10cm,height=8cm]{movie_thumb.png}},autoplay,mouse=true]{10cm}{8cm}{c_R_6_4.avi}
\end{figure}   
\end{center}
\end{frame}

\begin{frame}
  \frametitle{Discriminate between Right and Left}
   \begin{center}
     \includegraphics[scale=0.6]{monkey4.png}
   \end{center}
\end{frame}


\begin{frame}
  \frametitle{Model Setup and Notations}
   In each trial, the experimenter chooses a fixed motion
strength $c$ $(0 \le c \le 1)$ and an underlying direction $d$:
\begin{itemize}
\item $c = 0$ corresponds to completely random motion.
\item $c = 1$ corresponds to all dots moving in a coherent direction.
\item Intermediate
values of $c$ correspond to the fraction of dots moving
coherently leftward or rightward.
\end{itemize}
But $c$ is {\em unknown} to the agent.
\end{frame}



\begin{frame}
  \frametitle{Motion Strength with $c = 25.6\%$  }
   \begin{center}
     \begin{figure}[ht]
     \includemovie[poster,text={\includegraphics[width=10cm,height=8cm]{movie_thumb.png}},autoplay,mouse=true]{10cm}{8cm}{c_L_25_6.avi}
\end{figure}   
\end{center}
\end{frame}


\begin{frame}
  \frametitle{Modeling the task}
  Think of the random dots as coins.
  \begin{itemize}
  \item Right and Left $\equiv$ Heads and Tails
  \item Observing a sequence of heads and tails
  \item Figuring out whether the coin is head-biased ($d = 1$) or tail-biased ($d = -1$).
  \end{itemize}
\end{frame}




\begin{frame}
  \frametitle{Model Setup}
\begin{itemize}
\item $n$: number of dots on the screen
\item $o_t \in \{0, \ldots, n\}$: number of heads
\item $n - o_t$: number of tails
\item $o_t$ follows a binomial distribution: $o_t \sim \mathrm{Bino}(n, \frac{cd+1}{2})$
\end{itemize}
\end{frame}
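
\begin{frame}[fragile]
  \frametitle{Model Setup: Sampling Sketch}
  A minimal sketch of the observation model above, assuming
  \texttt{numpy}; the function name and parameter values here are ours,
  for illustration only.
\begin{verbatim}
import numpy as np

def sample_observation(n, c, d, rng):
    # o_t ~ Binomial(n, (c*d + 1)/2):
    # n dots, motion strength c in [0, 1],
    # true direction d in {-1, +1}.
    p_right = (c * d + 1.0) / 2.0
    return rng.binomial(n, p_right)

rng = np.random.default_rng(0)
o_t = sample_observation(n=20, c=0.256, d=1, rng=rng)
\end{verbatim}
\end{frame}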

\begin{frame}
  \frametitle{Inference}
  \begin{itemize}
  \item Given $o_{1:t}$, infer $d$.
  \item $P(d) \to P(d|o_1) \to \ldots \to P(d|o_{1:t})$ using Bayes' rule
    \begin{align*}
      P(d|o_{1:t}) &= {\color{red} P(o_t|d)}\, P(d|o_{1:t-1}) / P(o_t|o_{1:t-1}) \\
      \text{Posterior} &= {\color{red} \text{Likelihood}} \times \text{Prior} / \text{Normalization}
    \end{align*}
  \item However, $P(o_t|d)$ depends on the unknown $c$
\end{itemize}
\end{frame}
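
\begin{frame}[fragile]
  \frametitle{Inference: Recursion Sketch}
  A sketch of the recursion above for the hypothetical case where $c$
  {\em were} known (the next slides drop this assumption); it uses
  \texttt{numpy} and \texttt{scipy}, and all names are ours.
\begin{verbatim}
import numpy as np
from scipy.stats import binom

def bayes_step(p_d, o_t, n, c):
    # P(d|o_{1:t}) propto P(o_t|d) * P(d|o_{1:t-1})
    like = np.array([binom.pmf(o_t, n, (c * d + 1) / 2)
                     for d in (-1, +1)])
    post = like * p_d
    return post / post.sum()

p_d = np.array([0.5, 0.5])   # prior on d = -1, d = +1
for o_t in [14, 12, 15]:     # rightward counts, n = 20
    p_d = bayes_step(p_d, o_t, n=20, c=0.256)
\end{verbatim}
\end{frame}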


\begin{frame}
  \frametitle{Belief State}
\begin{itemize}
   \item Therefore, infer $c$ and $d$ together:
    \begin{align*}
      P(d,c|o_{1:t}) = P(o_t|d,c)\, P(d,c|o_{1:t-1}) / P(o_t|o_{1:t-1})
   \end{align*}
   \item We call $b_t = P(d,c|o_{1:t})$ the belief state.
   \item The agent does not know $c$ or $d$, but maintains a belief state over them.
\end{itemize}
\end{frame}


\begin{frame}
\frametitle{Belief Update}
\begin{itemize}
   \item Let $P(d,c)$ follow a Beta distribution (the conjugate prior of the binomial).
  \item Given $o_{1:t}$, we update $b_t$ as follows:
    \begin{align*}
      \mathrm{Beta}(\alpha_0,\beta_0) &\overset{o_1}{\to} \mathrm{Beta}(\alpha_0 + o_1,\ \beta_0 + (n - o_1)) \\
 & \overset{o_2}{\to} \mathrm{Beta}(\alpha_0 + o_1 + o_2,\ \beta_0 + (n - o_1) + (n - o_2))\\
 \ldots & \overset{o_t}{\to} \mathrm{Beta}\Big(\alpha_0 + \sum_{\tau=1}^t o_\tau,\ \beta_0 + \sum_{\tau=1}^t (n - o_\tau) \Big)
    \end{align*}
  \item $b_t = \mathrm{Beta}(\alpha,\beta)$ with
    \begin{itemize}
     \item $\alpha = m_R + \alpha_0$, where $m_R = \sum_{\tau=1}^t o_\tau$
    \item $\beta = m_L + \beta_0$, where $m_L = \sum_{\tau=1}^t (n - o_\tau)$
   \end{itemize}
   \item $b_t$ depends only on the numbers of rightward and leftward moving dots, $m_R$ and $m_L$.
  \end{itemize}
\end{frame}
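
\begin{frame}[fragile]
  \frametitle{Belief Update: Code Sketch}
  Because the belief is summarized by $(\alpha, \beta)$, the update above
  is just counting. A minimal sketch; variable names are ours.
\begin{verbatim}
def update_belief(alpha, beta, o_t, n):
    # Conjugate update: rightward dots go to alpha,
    # leftward dots go to beta.
    return alpha + o_t, beta + (n - o_t)

alpha, beta = 1.0, 1.0       # prior Beta(alpha_0, beta_0)
for o_t in [14, 12, 15]:     # rightward counts, n = 20
    alpha, beta = update_belief(alpha, beta, o_t, n=20)
# Now alpha = alpha_0 + m_R and beta = beta_0 + m_L.
\end{verbatim}
\end{frame}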


\begin{frame}
  \frametitle{A Markov decision process}
     Actions $a_t\in \{A_R, A_L, A_S\}$
       \begin{itemize}
       \item $A_R$:  Rightward eye movement and terminate the trial.
       \item $A_L$:  Leftward eye movement and terminate the trial.
       \item $A_S$:  Wait for $o_{t+1}$
       \end{itemize}
       \begin{center}
         \includegraphics[scale=0.25]{beliefUpdate.jpg}
       \end{center}
\end{frame}

\begin{frame}
  \frametitle{Reward model}
      \begin{itemize}
      \item $R_P > 0$: positive reward for a correct choice, {\em i.e.}, a rightward eye movement $A_R$ when $d = 1$ or a leftward eye movement $A_L$ when $d = -1$.
      \item $R_N \le 0$: negative reward (penalty), or nothing, for an incorrect choice.
      \item $R_S = -1$: a unit penalty for each random-dot sample.
      \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Optimal Policy}
The goal is to find an optimal policy $\pi^*$ that maximizes the expected total future reward
      \begin{eqnarray*}
      v^*(b_t) = \max_{\pi} \mathrm{E}_\pi\Big[\sum_{k=1}^{\infty} r_{t+k} \,\Big|\, b_t\Big]
      \end{eqnarray*}
\begin{center}
\includegraphics[scale=0.3]{policy.jpg}
\end{center}
\end{frame}
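
\begin{frame}[fragile]
  \frametitle{Optimal Policy: Backward-Induction Sketch}
  One way to compute $\pi^*$ numerically: truncate to a finite horizon
  $T$ and do backward induction over the Beta belief, which is indexed
  by the rightward count $m_R$. The horizon and reward values below are
  our assumptions, chosen only for illustration.
{\small
\begin{verbatim}
import numpy as np
from scipy.special import betainc
from scipy.stats import betabinom

n, T = 20, 30                      # dots per frame, horizon
R_P, R_N, R_S = 10.0, 0.0, -1.0    # rewards (assumed values)
a0, b0 = 1.0, 1.0                  # Beta prior

def stop_value(a, b):
    # Choose the likelier direction; P(d = 1) = P(p > 1/2).
    p_right = 1.0 - betainc(a, b, 0.5)
    p_best = max(p_right, 1.0 - p_right)
    return R_P * p_best + R_N * (1.0 - p_best)

# V[t][m] = value at time t with m rightward dots so far.
V = [np.zeros(t * n + 1) for t in range(T + 1)]
V[T] = np.array([stop_value(a0 + m, b0 + T * n - m)
                 for m in range(T * n + 1)])
for t in range(T - 1, -1, -1):
    for m in range(t * n + 1):
        a, b = a0 + m, b0 + t * n - m
        pk = betabinom(n, a, b).pmf(np.arange(n + 1))
        wait = R_S + pk @ V[t + 1][m : m + n + 1]
        V[t][m] = max(stop_value(a, b), wait)
\end{verbatim}
}
\end{frame}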


\begin{frame}
  \frametitle{Decision Making $c = 25.6\%$}
Given $o_{1:t}$, we maintain a sequence of belief states $b_1, b_2, \ldots, b_t$.
\begin{center}
\includegraphics[scale=0.1]{policy_0_256.jpg}
\end{center}
\end{frame}


\begin{frame}
  \frametitle{Decision Making with $c = 0\%$}
\begin{center}
\includegraphics[scale=0.1]{policy_0.jpg}
\end{center}
\end{frame}

\begin{frame}
  \frametitle{Speed and Accuracy}
Let $\Pr[t,R|c]$ be the joint probability mass
function that the agent makes a rightward choice at time $t$:
\begin{center}
 \begin{eqnarray*}
  \Pr[t,R|c] &=& \Pr[b_t \in \Pi^R_t , b_{t-1} \in \Pi^S_{t-1}, \ldots, b_{1} \in \Pi^S_{1}|c] \\ 
\Pr[R|c] &=& \sum_{t=1}^{\infty} \Pr[t,R|c] \quad \quad \quad \mbox{Accuracy}\\
  RT_R(c) &=& \sum_{t=1}^{\infty} t \frac{\Pr[t,R|c]}{\Pr[R|c]} \quad \quad \quad  \mbox{Speed}
 \end{eqnarray*}
\end{center}
\end{frame}
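
\begin{frame}[fragile]
  \frametitle{Speed and Accuracy: Simulation Sketch}
  $\Pr[R|c]$ and $RT_R(c)$ can also be estimated by Monte Carlo. The
  sketch below uses a fixed confidence threshold $\theta$ as a
  stand-in for the optimal regions $\Pi^R_t, \Pi^S_t$; the threshold
  and all names are ours.
{\small
\begin{verbatim}
import numpy as np
from scipy.special import betainc

n, T, a0, b0, theta = 20, 30, 1.0, 1.0, 0.9

def one_trial(c, d, rng):
    m = 0
    for t in range(1, T + 1):
        m += rng.binomial(n, (c * d + 1) / 2)
        p_right = 1.0 - betainc(a0 + m, b0 + t * n - m, 0.5)
        if p_right > theta:      return t, +1
        if p_right < 1 - theta:  return t, -1
    return T, (+1 if p_right >= 0.5 else -1)

rng = np.random.default_rng(0)
trials = [one_trial(0.256, +1, rng) for _ in range(10000)]
rights = [t for t, ch in trials if ch == +1]
acc = len(rights) / len(trials)   # estimates Pr[R | c]
rt_R = np.mean(rights)            # estimates RT_R(c)
\end{verbatim}
}
\end{frame}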

\begin{frame}
  \frametitle{Speed and Accuracy}
\begin{center}
\includegraphics[scale=0.1]{PCRT.jpg}
\end{center}
\end{frame}

\begin{frame}
  \frametitle{Unsolved Puzzle}
\large  
When the agent is given a prior over $d$, what is the optimal policy then?
\end{frame}

\begin{frame}
\frametitle{Solution One: $b_0 =$ Prior}
\begin{center}
\includegraphics[scale=0.06]{prior_wrong.jpg}
\includegraphics[scale=0.15]{prior_exp.jpg}
\end{center}
\end{frame}


\begin{frame}
  \frametitle{Summary}
  \begin{itemize}
  \item Our model predicts psychometric and chronometric functions that are quantitatively close to those observed in monkeys. 
\item  We show through analytical derivations and numerical results that the optimal threshold for selecting overt actions is a declining function of time.
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Future Work}
  \begin{itemize}
\item  A normative explanation for the dynamic bias signal
assumption, in which prior information plays an
increasingly important role in the decision process over time.
  \item A neural implementation based on Temporal Difference (TD) learning.
\item  Including a ``bailout'' action for post-decision wagering.
\item  Time-varying motion strength.
\item  Extending the POMDP model to more general graphical models.
  \end{itemize}
\end{frame}




\begin{frame}
  \frametitle{Acknowledgement}
  \begin{center}
\includegraphics[scale=0.4]{ack.jpg}
\end{center}
\end{frame}

\end{document}
